cache.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346
  1. """Cache Management
  2. """
  3. import hashlib
  4. import json
  5. import logging
  6. import os
  7. from pip._vendor.packaging.tags import interpreter_name, interpreter_version
  8. from pip._vendor.packaging.utils import canonicalize_name
  9. from pip._internal.exceptions import InvalidWheelFilename
  10. from pip._internal.models.link import Link
  11. from pip._internal.models.wheel import Wheel
  12. from pip._internal.utils.temp_dir import TempDirectory, tempdir_kinds
  13. from pip._internal.utils.typing import MYPY_CHECK_RUNNING
  14. from pip._internal.utils.urls import path_to_url
  15. if MYPY_CHECK_RUNNING:
  16. from typing import Optional, Set, List, Any, Dict
  17. from pip._vendor.packaging.tags import Tag
  18. from pip._internal.models.format_control import FormatControl
  19. logger = logging.getLogger(__name__)
  20. def _hash_dict(d):
  21. # type: (Dict[str, str]) -> str
  22. """Return a stable sha224 of a dictionary."""
  23. s = json.dumps(d, sort_keys=True, separators=(",", ":"), ensure_ascii=True)
  24. return hashlib.sha224(s.encode("ascii")).hexdigest()
  25. class Cache(object):
  26. """An abstract class - provides cache directories for data from links
  27. :param cache_dir: The root of the cache.
  28. :param format_control: An object of FormatControl class to limit
  29. binaries being read from the cache.
  30. :param allowed_formats: which formats of files the cache should store.
  31. ('binary' and 'source' are the only allowed values)
  32. """
  33. def __init__(self, cache_dir, format_control, allowed_formats):
  34. # type: (str, FormatControl, Set[str]) -> None
  35. super(Cache, self).__init__()
  36. assert not cache_dir or os.path.isabs(cache_dir)
  37. self.cache_dir = cache_dir or None
  38. self.format_control = format_control
  39. self.allowed_formats = allowed_formats
  40. _valid_formats = {"source", "binary"}
  41. assert self.allowed_formats.union(_valid_formats) == _valid_formats
  42. def _get_cache_path_parts_legacy(self, link):
  43. # type: (Link) -> List[str]
  44. """Get parts of part that must be os.path.joined with cache_dir
  45. Legacy cache key (pip < 20) for compatibility with older caches.
  46. """
  47. # We want to generate an url to use as our cache key, we don't want to
  48. # just re-use the URL because it might have other items in the fragment
  49. # and we don't care about those.
  50. key_parts = [link.url_without_fragment]
  51. if link.hash_name is not None and link.hash is not None:
  52. key_parts.append("=".join([link.hash_name, link.hash]))
  53. key_url = "#".join(key_parts)
  54. # Encode our key url with sha224, we'll use this because it has similar
  55. # security properties to sha256, but with a shorter total output (and
  56. # thus less secure). However the differences don't make a lot of
  57. # difference for our use case here.
  58. hashed = hashlib.sha224(key_url.encode()).hexdigest()
  59. # We want to nest the directories some to prevent having a ton of top
  60. # level directories where we might run out of sub directories on some
  61. # FS.
  62. parts = [hashed[:2], hashed[2:4], hashed[4:6], hashed[6:]]
  63. return parts
  64. def _get_cache_path_parts(self, link):
  65. # type: (Link) -> List[str]
  66. """Get parts of part that must be os.path.joined with cache_dir
  67. """
  68. # We want to generate an url to use as our cache key, we don't want to
  69. # just re-use the URL because it might have other items in the fragment
  70. # and we don't care about those.
  71. key_parts = {"url": link.url_without_fragment}
  72. if link.hash_name is not None and link.hash is not None:
  73. key_parts[link.hash_name] = link.hash
  74. if link.subdirectory_fragment:
  75. key_parts["subdirectory"] = link.subdirectory_fragment
  76. # Include interpreter name, major and minor version in cache key
  77. # to cope with ill-behaved sdists that build a different wheel
  78. # depending on the python version their setup.py is being run on,
  79. # and don't encode the difference in compatibility tags.
  80. # https://github.com/pypa/pip/issues/7296
  81. key_parts["interpreter_name"] = interpreter_name()
  82. key_parts["interpreter_version"] = interpreter_version()
  83. # Encode our key url with sha224, we'll use this because it has similar
  84. # security properties to sha256, but with a shorter total output (and
  85. # thus less secure). However the differences don't make a lot of
  86. # difference for our use case here.
  87. hashed = _hash_dict(key_parts)
  88. # We want to nest the directories some to prevent having a ton of top
  89. # level directories where we might run out of sub directories on some
  90. # FS.
  91. parts = [hashed[:2], hashed[2:4], hashed[4:6], hashed[6:]]
  92. return parts
  93. def _get_candidates(self, link, canonical_package_name):
  94. # type: (Link, str) -> List[Any]
  95. can_not_cache = (
  96. not self.cache_dir or
  97. not canonical_package_name or
  98. not link
  99. )
  100. if can_not_cache:
  101. return []
  102. formats = self.format_control.get_allowed_formats(
  103. canonical_package_name
  104. )
  105. if not self.allowed_formats.intersection(formats):
  106. return []
  107. candidates = []
  108. path = self.get_path_for_link(link)
  109. if os.path.isdir(path):
  110. for candidate in os.listdir(path):
  111. candidates.append((candidate, path))
  112. # TODO remove legacy path lookup in pip>=21
  113. legacy_path = self.get_path_for_link_legacy(link)
  114. if os.path.isdir(legacy_path):
  115. for candidate in os.listdir(legacy_path):
  116. candidates.append((candidate, legacy_path))
  117. return candidates
  118. def get_path_for_link_legacy(self, link):
  119. # type: (Link) -> str
  120. raise NotImplementedError()
  121. def get_path_for_link(self, link):
  122. # type: (Link) -> str
  123. """Return a directory to store cached items in for link.
  124. """
  125. raise NotImplementedError()
  126. def get(
  127. self,
  128. link, # type: Link
  129. package_name, # type: Optional[str]
  130. supported_tags, # type: List[Tag]
  131. ):
  132. # type: (...) -> Link
  133. """Returns a link to a cached item if it exists, otherwise returns the
  134. passed link.
  135. """
  136. raise NotImplementedError()
  137. class SimpleWheelCache(Cache):
  138. """A cache of wheels for future installs.
  139. """
  140. def __init__(self, cache_dir, format_control):
  141. # type: (str, FormatControl) -> None
  142. super(SimpleWheelCache, self).__init__(
  143. cache_dir, format_control, {"binary"}
  144. )
  145. def get_path_for_link_legacy(self, link):
  146. # type: (Link) -> str
  147. parts = self._get_cache_path_parts_legacy(link)
  148. assert self.cache_dir
  149. return os.path.join(self.cache_dir, "wheels", *parts)
  150. def get_path_for_link(self, link):
  151. # type: (Link) -> str
  152. """Return a directory to store cached wheels for link
  153. Because there are M wheels for any one sdist, we provide a directory
  154. to cache them in, and then consult that directory when looking up
  155. cache hits.
  156. We only insert things into the cache if they have plausible version
  157. numbers, so that we don't contaminate the cache with things that were
  158. not unique. E.g. ./package might have dozens of installs done for it
  159. and build a version of 0.0...and if we built and cached a wheel, we'd
  160. end up using the same wheel even if the source has been edited.
  161. :param link: The link of the sdist for which this will cache wheels.
  162. """
  163. parts = self._get_cache_path_parts(link)
  164. assert self.cache_dir
  165. # Store wheels within the root cache_dir
  166. return os.path.join(self.cache_dir, "wheels", *parts)
  167. def get(
  168. self,
  169. link, # type: Link
  170. package_name, # type: Optional[str]
  171. supported_tags, # type: List[Tag]
  172. ):
  173. # type: (...) -> Link
  174. candidates = []
  175. if not package_name:
  176. return link
  177. canonical_package_name = canonicalize_name(package_name)
  178. for wheel_name, wheel_dir in self._get_candidates(
  179. link, canonical_package_name
  180. ):
  181. try:
  182. wheel = Wheel(wheel_name)
  183. except InvalidWheelFilename:
  184. continue
  185. if canonicalize_name(wheel.name) != canonical_package_name:
  186. logger.debug(
  187. "Ignoring cached wheel %s for %s as it "
  188. "does not match the expected distribution name %s.",
  189. wheel_name, link, package_name,
  190. )
  191. continue
  192. if not wheel.supported(supported_tags):
  193. # Built for a different python/arch/etc
  194. continue
  195. candidates.append(
  196. (
  197. wheel.support_index_min(supported_tags),
  198. wheel_name,
  199. wheel_dir,
  200. )
  201. )
  202. if not candidates:
  203. return link
  204. _, wheel_name, wheel_dir = min(candidates)
  205. return Link(path_to_url(os.path.join(wheel_dir, wheel_name)))
  206. class EphemWheelCache(SimpleWheelCache):
  207. """A SimpleWheelCache that creates it's own temporary cache directory
  208. """
  209. def __init__(self, format_control):
  210. # type: (FormatControl) -> None
  211. self._temp_dir = TempDirectory(
  212. kind=tempdir_kinds.EPHEM_WHEEL_CACHE,
  213. globally_managed=True,
  214. )
  215. super(EphemWheelCache, self).__init__(
  216. self._temp_dir.path, format_control
  217. )
  218. class CacheEntry(object):
  219. def __init__(
  220. self,
  221. link, # type: Link
  222. persistent, # type: bool
  223. ):
  224. self.link = link
  225. self.persistent = persistent
  226. class WheelCache(Cache):
  227. """Wraps EphemWheelCache and SimpleWheelCache into a single Cache
  228. This Cache allows for gracefully degradation, using the ephem wheel cache
  229. when a certain link is not found in the simple wheel cache first.
  230. """
  231. def __init__(self, cache_dir, format_control):
  232. # type: (str, FormatControl) -> None
  233. super(WheelCache, self).__init__(
  234. cache_dir, format_control, {'binary'}
  235. )
  236. self._wheel_cache = SimpleWheelCache(cache_dir, format_control)
  237. self._ephem_cache = EphemWheelCache(format_control)
  238. def get_path_for_link_legacy(self, link):
  239. # type: (Link) -> str
  240. return self._wheel_cache.get_path_for_link_legacy(link)
  241. def get_path_for_link(self, link):
  242. # type: (Link) -> str
  243. return self._wheel_cache.get_path_for_link(link)
  244. def get_ephem_path_for_link(self, link):
  245. # type: (Link) -> str
  246. return self._ephem_cache.get_path_for_link(link)
  247. def get(
  248. self,
  249. link, # type: Link
  250. package_name, # type: Optional[str]
  251. supported_tags, # type: List[Tag]
  252. ):
  253. # type: (...) -> Link
  254. cache_entry = self.get_cache_entry(link, package_name, supported_tags)
  255. if cache_entry is None:
  256. return link
  257. return cache_entry.link
  258. def get_cache_entry(
  259. self,
  260. link, # type: Link
  261. package_name, # type: Optional[str]
  262. supported_tags, # type: List[Tag]
  263. ):
  264. # type: (...) -> Optional[CacheEntry]
  265. """Returns a CacheEntry with a link to a cached item if it exists or
  266. None. The cache entry indicates if the item was found in the persistent
  267. or ephemeral cache.
  268. """
  269. retval = self._wheel_cache.get(
  270. link=link,
  271. package_name=package_name,
  272. supported_tags=supported_tags,
  273. )
  274. if retval is not link:
  275. return CacheEntry(retval, persistent=True)
  276. retval = self._ephem_cache.get(
  277. link=link,
  278. package_name=package_name,
  279. supported_tags=supported_tags,
  280. )
  281. if retval is not link:
  282. return CacheEntry(retval, persistent=False)
  283. return None